Table of Contents
- 1 Loading and preparing data
- 2 Ranking color-semantic associations in word embeddings
- 3 Creating datasets for statistical models
- 4 Nameability of color-dimension associations
- 4.0.1 Exporting names generated by participants for use in training corpus filtering
- 4.0.2 Correlating COCA-fiction cosine similarities to nameability measures
- 4.0.3 Correlating group-averaged human ratings to nameability measure differentials.
- 4.0.4 Correlation of group-averaged split-inverse ratings with nameability measures
- 5 Extracting non-color nearest neighbors for each dimension
- 6 More figures
- 7 Convert notebook to html
%matplotlib inline
%config InlineBackend.figure_format='retina'
from IPython.display import display, display_markdown
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import subprocess as sp
import numpy as np
import pandas as pd
import seaborn as sns
import arviz as az
import bambi
import copy
import warnings
import matplotlib.pyplot as plt
# default figure size (inches) and resolution for all matplotlib plots in this notebook
plt.rcParams['figure.figsize'] = [10, 8]
plt.rcParams['figure.dpi'] = 300
from scipy.stats import pearsonr, spearmanr
from itertools import chain
from collections import Counter
from subs2vec.utensils import log_timer
from subs2vec.vecs import Vectors
from subs2vec.neighbors import compute_nn
def display_md(md, **kwargs):
    """Render a markdown string in the notebook output (raw text, not a repr)."""
    rendered = display_markdown(md, raw=True, **kwargs)
    return rendered
def convert_notebook(title, output='html'):
    """Convert the notebook `title`.ipynb to `output` format using nbconvert.

    Reports success or failure as a markdown message in the notebook output.
    """
    # pass the command as an argument list (shell=False) so a title containing
    # spaces or shell metacharacters survives intact
    convert = sp.run(['jupyter', 'nbconvert', f'{title}.ipynb',
                      '--to', output, '--output', f'{title}.{output}'])
    if convert.returncode == 0:
        display_md(f'Jupyter notebook `{title}` converted successfully.')
    else:
        display_md(f'Error: encountered problem converting Jupyter notebook `{title}`')
def download(fname):
    """Download `fname` (a URL) with wget and report the outcome.

    Reports success or failure as a markdown message in the notebook output.
    """
    # argument list instead of str.split(' ') so URLs with special characters work
    dl = sp.run(['wget', fname])
    if dl.returncode == 0:
        # fixed typo: 'succesful' -> 'successful'
        display_md(f'Download of `{fname}` successful.')
    else:
        display_md(f'Download of `{fname}` failed.')
@log_timer
def filter_vecs(vecs, filter_words):
    """Return a deep copy of `vecs` restricted to the words in `filter_words`.

    Parameters
    ----------
    vecs : subs2vec Vectors object with `.words`, `.vectors`, and `.n` attributes
    filter_words : array-like of words to keep

    Returns
    -------
    A new Vectors object containing only the kept words; `vecs` is untouched.
    """
    filtered_vecs = copy.deepcopy(vecs)
    # compute the membership mask once instead of twice (original recomputed np.isin)
    keep = np.isin(filtered_vecs.words, filter_words)
    filtered_vecs.vectors = filtered_vecs.vectors[keep]
    filtered_vecs.words = filtered_vecs.words[keep]
    filtered_vecs.n = len(filtered_vecs.words)
    display_md(f'Filtered {vecs.n} vectors, {filtered_vecs.n} remaining.')
    return filtered_vecs
def norm(x):
    """Scale vector `x` to unit Euclidean (L2) length."""
    length = np.linalg.norm(x, 2)
    return x / length
# seaborn whitegrid theme for all figures
sns.set(style='whitegrid')
# silence pandas SettingWithCopyWarning for chained assignment
pd.options.mode.chained_assignment = None
WARNING (pytensor.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
Loading and preparing data¶
Loading original participant data (NZ)¶
# original (NZ) participant ratings: one column per color, one row per dimension judgment
df = pd.read_csv('data/saysani_data.tsv', sep='\t')
display(df)
| participant | white | red | orange | yellow | green | blue | purple | brown | black | dimension | group | pp_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 7 | 7 | 5 | 1 | 1 | 1 | 3 | 7 | cold-hot | sighted | sighted_1 |
| 1 | 1 | 7 | 1 | 4 | 2 | 3 | 3 | 6 | 6 | 7 | ripe-unripe | sighted | sighted_1 |
| 2 | 1 | 1 | 5 | 6 | 7 | 4 | 2 | 3 | 7 | 6 | new-old | sighted | sighted_1 |
| 3 | 1 | 1 | 7 | 2 | 1 | 4 | 2 | 3 | 5 | 7 | submissive-aggressive | sighted | sighted_1 |
| 4 | 1 | 1 | 7 | 6 | 1 | 2 | 2 | 5 | 3 | 5 | selfless-jealous | sighted | sighted_1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 539 | 12 | 1 | 2 | 1 | 2 | 2 | 3 | 2 | 3 | 4 | soft-hard | blind | blind_12 |
| 540 | 12 | 4 | 3 | 3 | 4 | 2 | 2 | 3 | 2 | 5 | light-heavy | blind | blind_12 |
| 541 | 12 | 2 | 4 | 2 | 1 | 2 | 1 | 2 | 3 | 2 | relaxed-tense | blind | blind_12 |
| 542 | 12 | 4 | 2 | 1 | 1 | 1 | 3 | 2 | 3 | 5 | alive-dead | blind | blind_12 |
| 543 | 12 | 6 | 7 | 4 | 3 | 4 | 4 | 1 | 2 | 5 | fast-slow | blind | blind_12 |
544 rows × 13 columns
Melt dataframe to long format¶
# the nine color terms rated in the experiment
colors = ['white', 'red', 'orange', 'yellow', 'green', 'blue', 'purple', 'brown', 'black']
# reshape from one column per color to one row per (participant, dimension, color) rating
df_orig = df.melt(
    id_vars=['group', 'dimension', 'pp_id'],
    value_vars=colors,
    var_name='color',
    value_name='rating',
)
# collect the individual dimension words, e.g. 'cold-hot' -> 'cold', 'hot'
dimension_labels = df_orig['dimension'].unique()
dimension_pairs = [label.split('-') for label in dimension_labels]
dimensions = list(chain.from_iterable(dimension_pairs))
# tag this dataset so it stays distinguishable after merging with the replications
df_orig['experiment'] = 'original'
df_orig['self_vs_other'] = 'self'
display(df_orig)
| group | dimension | pp_id | color | rating | experiment | self_vs_other | |
|---|---|---|---|---|---|---|---|
| 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self |
| 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self |
| 2 | sighted | new-old | sighted_1 | white | 1 | original | self |
| 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self |
| 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 4891 | blind | soft-hard | blind_12 | black | 4 | original | self |
| 4892 | blind | light-heavy | blind_12 | black | 5 | original | self |
| 4893 | blind | relaxed-tense | blind_12 | black | 2 | original | self |
| 4894 | blind | alive-dead | blind_12 | black | 5 | original | self |
| 4895 | blind | fast-slow | blind_12 | black | 5 | original | self |
4896 rows × 7 columns
Loading replication participant data (US)¶
# replication 1 (US participants)
df_rep = pd.read_csv('data/replication1_data.csv')
# drop the test participant (pp_id 3) and the catch trials
keep = (df_rep['pp_id'] != 3) & (df_rep['question_type'] != 'catch')
df_rep = df_rep.loc[keep].drop(columns=['question_type', 'prompt_pre_1'])
# long format: one row per rating, covering both self and other judgments
df_rep = df_rep.melt(
    id_vars=['dimension', 'color', 'pp_id'],
    value_vars=['value', 'others_choice'],
    var_name='self_vs_other',
    value_name='rating',
)
# harmonize identifiers and labels with the original dataset
df_rep['pp_id'] = 'sighted_' + df_rep['pp_id'].astype(str)
df_rep['self_vs_other'] = df_rep['self_vs_other'].replace({'value': 'self', 'others_choice': 'other'})
df_rep['group'] = 'sighted'
df_rep['experiment'] = 'replication_1'
# one dimension label came through garbled in the raw data; correct it
df_rep['dimension'] = df_rep['dimension'].replace({'like-dis...like': 'like-dislike'})
display(df_rep)
| dimension | color | pp_id | self_vs_other | rating | group | experiment | |
|---|---|---|---|---|---|---|---|
| 0 | clean-dirty | yellow | sighted_69819 | self | 5 | sighted | replication_1 |
| 1 | soft-hard | yellow | sighted_69819 | self | 2 | sighted | replication_1 |
| 2 | ripe-unripe | yellow | sighted_69819 | self | 1 | sighted | replication_1 |
| 3 | selfless-jealous | yellow | sighted_69819 | self | 5 | sighted | replication_1 |
| 4 | high-low | yellow | sighted_69819 | self | 1 | sighted | replication_1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 9567 | like-dislike | orange | sighted_69785 | other | 4 | sighted | replication_1 |
| 9568 | new-old | orange | sighted_69785 | other | 4 | sighted | replication_1 |
| 9569 | clean-dirty | orange | sighted_69785 | other | 5 | sighted | replication_1 |
| 9570 | relaxed-tense | orange | sighted_69785 | other | 5 | sighted | replication_1 |
| 9571 | active-passive | orange | sighted_69785 | other | 3 | sighted | replication_1 |
9572 rows × 7 columns
Loading 2nd replication data (US, with reading measures)¶
# replication 2 (US, with reading measures); drop the two leftover index columns
df_read = pd.read_csv('data/replication2_data_with_reading.csv')
df_read = df_read.drop(columns=['Unnamed: 0', 'X'])
display(df_read)
| dimension | group | subj_id | color | value | question_type | others_choice | art | fiction | nonfiction | ... | Q9_17 | Q9_18 | Q9_19 | Q9_20 | Q9_21 | composite_read | upper_art | upper_fiction | upper_nonfiction | upper_read_motivation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | replication-sighted | 69212 | brown | 4 | semantic_diff | 4 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 1 | ripe-unripe | replication-sighted | 69212 | brown | 7 | semantic_diff | 6 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 2 | new-old | replication-sighted | 69212 | brown | 6 | semantic_diff | 6 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 3 | submissive-aggressive | replication-sighted | 69212 | brown | 2 | semantic_diff | 2 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| 4 | selfless-jealous | replication-sighted | 69212 | brown | 5 | semantic_diff | 4 | 3.0 | 0.0 | 1.0 | ... | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14251 | light-heavy | replication-sighted | 68129 | red | 6 | semantic_diff | 5 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14252 | relaxed-tense | replication-sighted | 68129 | red | 6 | semantic_diff | 5 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14253 | alive-dead | replication-sighted | 68129 | red | 7 | semantic_diff | 6 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14254 | fast-slow | replication-sighted | 68129 | red | 1 | semantic_diff | 3 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
| 14255 | high-low | replication-sighted | 68129 | red | 1 | semantic_diff | 2 | 11.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | -1.0 | 0.0 | 1.0 | -0.555556 | 1.0 | 0.0 | 1.0 | 0.0 |
14256 rows × 36 columns
Compute reading subscales¶
# The 21 reading-motivation questionnaire items are combined into an overall
# score and four subscales. Q9_14 and Q9_17 are reverse-coded and therefore
# negated before averaging. Vectorized column arithmetic replaces the original
# five row-wise .apply(axis=1) lambdas: same elementwise operation order (so
# results, including NaN propagation, are identical) but far faster and without
# duplicating the item lists.
_reverse_coded = ('Q9_14', 'Q9_17')

def _item_mean(df, items):
    """Mean of the listed questionnaire columns, negating reverse-coded items."""
    total = 0
    for item in items:
        total = total + (-df[item] if item in _reverse_coded else df[item])
    return total / len(items)

# overall motivation: all 21 items
df_read['reading_motivation'] = _item_mean(df_read, [f'Q9_{i}' for i in range(1, 22)])
# subscales follow the original item assignments
df_read['reading_part_of_self'] = _item_mean(
    df_read, ['Q9_2', 'Q9_3', 'Q9_4', 'Q9_5', 'Q9_6', 'Q9_9', 'Q9_10', 'Q9_11'])
df_read['reading_efficacy'] = _item_mean(
    df_read, ['Q9_1', 'Q9_14', 'Q9_16', 'Q9_17', 'Q9_19', 'Q9_20'])
df_read['reading_recognition'] = _item_mean(df_read, ['Q9_12', 'Q9_13', 'Q9_15'])
df_read['reading_other_realms'] = _item_mean(df_read, ['Q9_7', 'Q9_8', 'Q9_18', 'Q9_21'])
# align the participant id column name with the earlier datasets
df_read = df_read.rename(columns={'subj_id': 'pp_id'})
# the per-participant reading measures travel along as identifier variables
reading_cols = ['art', 'fiction', 'nonfiction', 'reading_motivation',
                'reading_part_of_self', 'reading_efficacy', 'reading_recognition',
                'reading_other_realms']
# long format: one row per rating, for both self and other judgments
df_read = df_read.melt(
    id_vars=['dimension', 'color', 'pp_id'] + reading_cols,
    value_vars=['value', 'others_choice'],
    var_name='self_vs_other',
    value_name='rating',
)
# harmonize identifiers and labels with the other datasets
df_read['pp_id'] = 'sighted_' + df_read['pp_id'].astype(str)
df_read['self_vs_other'] = df_read['self_vs_other'].replace({'value': 'self', 'others_choice': 'other'})
df_read['group'] = 'sighted'
df_read['experiment'] = 'replication_2'
display(df_read)
| dimension | color | pp_id | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | self_vs_other | rating | group | experiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 4 | sighted | replication_2 |
| 1 | ripe-unripe | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 7 | sighted | replication_2 |
| 2 | new-old | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 6 | sighted | replication_2 |
| 3 | submissive-aggressive | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 2 | sighted | replication_2 |
| 4 | selfless-jealous | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 5 | sighted | replication_2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28507 | light-heavy | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 5 | sighted | replication_2 |
| 28508 | relaxed-tense | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 5 | sighted | replication_2 |
| 28509 | alive-dead | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 6 | sighted | replication_2 |
| 28510 | fast-slow | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 3 | sighted | replication_2 |
| 28511 | high-low | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 2 | sighted | replication_2 |
28512 rows × 15 columns
# summary statistics for the reading measures and ratings
df_read.describe()
| art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | rating | |
|---|---|---|---|---|---|---|---|---|---|
| count | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 28512.000000 |
| mean | 7.616279 | 0.593023 | 0.755814 | -0.107973 | -0.280523 | 0.203488 | -0.616279 | 0.151163 | 3.693147 |
| std | 6.612596 | 0.854251 | 0.987569 | 0.646210 | 0.879559 | 0.646210 | 0.910077 | 0.759909 | 1.424941 |
| min | -5.000000 | 0.000000 | 0.000000 | -1.619048 | -2.000000 | -1.000000 | -2.000000 | -2.000000 | 1.000000 |
| 25% | 3.000000 | 0.000000 | 0.000000 | -0.571429 | -1.000000 | -0.333333 | -1.333333 | -0.500000 | 3.000000 |
| 50% | 6.000000 | 0.000000 | 0.000000 | -0.119048 | -0.375000 | 0.166667 | -0.666667 | 0.250000 | 4.000000 |
| 75% | 10.000000 | 1.000000 | 1.000000 | 0.285714 | 0.250000 | 0.666667 | 0.000000 | 0.500000 | 5.000000 |
| max | 26.000000 | 4.000000 | 4.000000 | 1.619048 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 7.000000 |
# pairwise correlations between the reading measures
reading_measures = ['art', 'fiction', 'nonfiction', 'reading_motivation', 'reading_part_of_self',
                    'reading_efficacy', 'reading_recognition', 'reading_other_realms']
corrs = df_read[reading_measures].corr().round(2)
# mask the upper triangle (diagonal included) so only unique pairs are shown
mask = np.triu(np.ones_like(corrs, dtype=bool))
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True, mask=mask)
g.set_yticklabels(g.get_yticklabels(), rotation=0);
# distribution of the art-reading measure
g = sns.histplot(x='art', data=df_read)
Retrieving vectors for words and dimension word pairs¶
# load 300-d normalized embeddings trained on fiction subtitles (top 200k words)
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
# keep only the color and dimension words we actually need
vecs = filter_vecs(vecs, np.array(colors + dimensions))
vecs_dict = vecs.as_dict()
color_vecs = filter_vecs(vecs, np.array(colors))
dimension_vecs = filter_vecs(vecs, np.array(dimensions))
# one unit-length difference vector per dimension pair (first pole minus second)
dimension_pair_vecs = np.vstack([norm(vecs_dict[first] - vecs_dict[second])
                                 for first, second in dimension_pairs])
# rank colors by cosine similarity to each individual dimension word
dimension_neighbors = compute_nn(color_vecs, dimension_vecs.vectors, dimension_vecs.words, num_neighbors=9, whole_matrix=True)
# drop the most-dissimilar columns that compute_nn also returns
dimension_neighbors = dimension_neighbors.drop(
    columns=[f'neighbor -{i}' for i in range(1, 10)]
).rename(columns={'target': 'dimension'})
display(dimension_neighbors)
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.000 seconds [INFO] computing analogies using whole matrix additive method [INFO] <function compute_nn at 0x16c553250> ran in 0.001 seconds
| dimension | neighbor 0 | neighbor 1 | neighbor 2 | neighbor 3 | neighbor 4 | neighbor 5 | neighbor 6 | neighbor 7 | neighbor 8 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | like | white | black | yellow | orange | green | purple | blue | brown | red |
| 1 | old | white | brown | black | yellow | orange | green | blue | red | purple |
| 2 | new | black | white | green | yellow | red | blue | purple | orange | brown |
| 3 | light | yellow | orange | blue | green | red | purple | white | brown | black |
| 4 | hard | white | brown | black | red | orange | green | yellow | purple | blue |
| 5 | dead | black | white | brown | red | green | purple | yellow | orange | blue |
| 6 | cold | blue | black | white | green | brown | red | yellow | purple | orange |
| 7 | happy | white | orange | brown | yellow | purple | red | green | blue | black |
| 8 | hot | red | yellow | black | orange | purple | white | blue | green | brown |
| 9 | heavy | black | purple | red | blue | brown | green | yellow | orange | white |
| 10 | fast | red | white | yellow | black | blue | green | orange | brown | purple |
| 11 | soft | brown | green | yellow | purple | orange | red | white | blue | black |
| 12 | clean | white | blue | black | brown | yellow | green | red | orange | purple |
| 13 | slow | red | yellow | purple | blue | brown | black | green | orange | white |
| 14 | angry | orange | red | purple | black | white | yellow | blue | brown | green |
| 15 | alive | green | brown | orange | red | yellow | black | blue | purple | white |
| 16 | sad | brown | red | green | black | purple | yellow | blue | orange | white |
| 17 | fresh | green | red | white | blue | yellow | brown | black | purple | orange |
| 18 | calm | blue | green | white | brown | purple | red | black | yellow | orange |
| 19 | dirty | brown | yellow | blue | white | red | black | orange | green | purple |
| 20 | dull | brown | green | red | blue | yellow | orange | purple | black | white |
| 21 | relaxed | blue | white | green | yellow | red | purple | brown | orange | black |
| 22 | jealous | purple | red | black | orange | white | yellow | green | blue | brown |
| 23 | tense | white | black | blue | red | brown | green | orange | yellow | purple |
| 24 | exciting | green | orange | purple | blue | black | red | white | brown | yellow |
| 25 | active | orange | black | green | white | brown | red | blue | purple | yellow |
| 26 | ripe | orange | green | purple | yellow | red | brown | blue | black | white |
| 27 | aggressive | orange | yellow | black | white | brown | red | green | blue | purple |
| 28 | stale | brown | orange | yellow | white | green | red | purple | blue | black |
| 29 | dislike | purple | brown | black | orange | green | yellow | red | white | blue |
| 30 | passive | black | white | blue | green | red | brown | purple | orange | yellow |
| 31 | selfless | black | white | brown | blue | orange | purple | red | green | yellow |
| 32 | submissive | white | brown | black | purple | green | blue | orange | yellow | red |
| 33 | unripe | orange | purple | yellow | red | brown | green | black | blue | white |
Using dimension axes (word pair contrasts), with nearest neighbor (cosine) method¶
# rank colors by cosine similarity to each dimension-pair difference vector
dimension_neighbors = compute_nn(color_vecs, dimension_pair_vecs, np.array(dimension_labels), num_neighbors=9, whole_matrix=True)
# drop the most-dissimilar columns that compute_nn also returns
dimension_neighbors = dimension_neighbors.drop(
    columns=[f'neighbor -{i}' for i in range(1, 10)]
).rename(columns={'target': 'dimension'})
display(dimension_neighbors)
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.000 seconds [INFO] computing analogies using whole matrix additive method [INFO] <function compute_nn at 0x16c553250> ran in 0.001 seconds
| dimension | neighbor 0 | neighbor 1 | neighbor 2 | neighbor 3 | neighbor 4 | neighbor 5 | neighbor 6 | neighbor 7 | neighbor 8 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | blue | green | brown | white | black | purple | yellow | red | orange |
| 1 | ripe-unripe | green | orange | purple | red | brown | yellow | blue | black | white |
| 2 | new-old | green | red | purple | black | yellow | blue | white | orange | brown |
| 3 | submissive-aggressive | purple | white | brown | black | blue | green | red | yellow | orange |
| 4 | selfless-jealous | brown | white | black | blue | orange | green | yellow | purple | red |
| 5 | active-passive | orange | green | brown | red | yellow | purple | black | blue | white |
| 6 | like-dislike | white | black | yellow | orange | blue | green | red | brown | purple |
| 7 | clean-dirty | white | blue | green | black | orange | purple | red | yellow | brown |
| 8 | fresh-stale | green | blue | red | white | black | purple | yellow | orange | brown |
| 9 | calm-angry | blue | green | brown | white | yellow | black | purple | red | orange |
| 10 | happy-sad | white | orange | yellow | purple | blue | green | red | black | brown |
| 11 | exciting-dull | orange | white | purple | black | blue | red | green | yellow | brown |
| 12 | soft-hard | green | purple | yellow | blue | orange | brown | red | white | black |
| 13 | light-heavy | yellow | orange | blue | white | green | red | purple | brown | black |
| 14 | relaxed-tense | purple | yellow | orange | blue | green | white | brown | red | black |
| 15 | alive-dead | green | orange | yellow | blue | red | brown | purple | white | black |
| 16 | fast-slow | white | black | green | orange | red | blue | yellow | brown | purple |
Creating datasets for statistical models¶
Merging data and predictors¶
Merge data¶
# stack all three experiments into one long dataframe;
# reset_index keeps each dataset's original row index in an 'index' column
df_joint = pd.concat([df_orig, df_rep, df_read], ignore_index=False).reset_index()
display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
42980 rows × 16 columns
Add word frequency (Van Paridon & Thompson, 2020)¶
# unigram frequencies; switch to natural-log scale and discard the raw counts
freqs = (
    pd.read_csv('../datasets/dedup.en.words.unigrams.tsv', sep='\t')  # not included in git repo
    .assign(log_freq=lambda d: np.log(d['unigram_freq']))
    .drop(columns='unigram_freq')
)
display(freqs.round(2))
| unigram | log_freq | |
|---|---|---|
| 0 | the | 17.10 |
| 1 | you | 17.06 |
| 2 | i | 17.04 |
| 3 | to | 16.78 |
| 4 | a | 16.59 |
| ... | ... | ... |
| 2397976 | tpar1 | 0.00 |
| 2397977 | giacoia | 0.00 |
| 2397978 | ourcinders | 0.00 |
| 2397979 | tourret | 0.00 |
| 2397980 | iroki | 0.00 |
2397981 rows × 2 columns
# split each dimension label into its two pole words (e.g. 'cold-hot' -> 'cold', 'hot')
pole_words = df_joint['dimension'].str.split('-', expand=True)
df_joint['word1'] = pole_words[0]
df_joint['word2'] = pole_words[1]
# attach log frequency for each pole word, then store the differential (pole 1 minus pole 2)
df_joint = df_joint.merge(freqs, left_on='word1', right_on='unigram', how='left')
df_joint = df_joint.merge(freqs, left_on='word2', right_on='unigram', how='left')
df_joint['frequency'] = df_joint['log_freq_x'] - df_joint['log_freq_y']
df_joint = df_joint.drop(columns=['unigram_x', 'unigram_y', 'log_freq_x', 'log_freq_y'])
display(df_joint.round(2))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | word1 | word2 | frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | cold | hot | -0.22 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ripe | unripe | 3.49 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | new | old | 0.12 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | submissive | aggressive | -2.35 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | selfless | jealous | -2.96 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | light | heavy | 1.24 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | relaxed | tense | -0.23 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | alive | dead | -0.90 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | fast | slow | 0.76 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | high | low | 1.24 |
42980 rows × 19 columns
Add concreteness (Brysbaert et al., 2014)¶
# word concreteness norms from Brysbaert et al. (2014)
concreteness = pd.read_csv('../datasets/en-brysbaert-2014.tsv', sep='\t') # not included in git repo
display(concreteness)
| word | concreteness | |
|---|---|---|
| 0 | a | 1.46 |
| 1 | aardvark | 4.68 |
| 2 | aback | 1.65 |
| 3 | abacus | 4.52 |
| 4 | abandon | 2.54 |
| ... | ... | ... |
| 37053 | zoologist | 4.30 |
| 37054 | zoology | 3.37 |
| 37055 | zoom | 3.10 |
| 37056 | zoophobia | 2.04 |
| 37057 | zucchini | 4.87 |
37058 rows × 2 columns
# attach a concreteness rating for each pole word, then store the
# differential (pole 1 minus pole 2) and drop the merge scaffolding
df_joint = df_joint.merge(concreteness, left_on='word1', right_on='word', how='left')
df_joint = df_joint.merge(concreteness, left_on='word2', right_on='word', how='left')
df_joint['concreteness'] = df_joint['concreteness_x'] - df_joint['concreteness_y']
df_joint = df_joint.drop(columns=['word_x', 'word_y', 'concreteness_x', 'concreteness_y'])
display(df_joint.round(2))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | word1 | word2 | frequency | concreteness | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | cold | hot | -0.22 | -0.46 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ripe | unripe | 3.49 | -0.01 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | new | old | 0.12 | 0.09 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | submissive | aggressive | -2.35 | -0.82 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | selfless | jealous | -2.96 | -0.56 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | light | heavy | 1.24 | 0.84 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | relaxed | tense | -0.23 | 0.15 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | alive | dead | -0.90 | -0.93 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | fast | slow | 0.76 | 0.04 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | high | low | 1.24 | 0.12 |
42980 rows × 20 columns
Add Small World of Words associations (De Deyne et al., 2018)¶
# Small World of Words: one row per participant-cue, with up to three free associations (R1-R3)
swow = pd.read_csv('../datasets/SWOW-EN.R100.csv') # not included in git repo
display(swow)
| Unnamed: 0 | id | participantID | age | gender | nativeLanguage | country | education | created_at | cue | R1 | R2 | R3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 29 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | although | nevertheless | yet | but |
| 1 | 2 | 30 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | deal | no | cards | shake |
| 2 | 3 | 31 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | music | notes | band | rhythm |
| 3 | 4 | 32 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | inform | tell | rat on | NaN |
| 4 | 5 | 33 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | way | path | via | method |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1228195 | 1228196 | 1530300 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | strange | mask | weird | stranger |
| 1228196 | 1228197 | 1530290 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | sunset | sea | sky | clause |
| 1228197 | 1228198 | 1530291 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | useless | pitty | worthless | worth |
| 1228198 | 1228199 | 1530284 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | volume | loud | music | key |
| 1228199 | 1228200 | 1530288 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | whenever | who | where | always |
1228200 rows × 13 columns
def add_swow(df, swow, colname):
    """Add a SWOW association differential column to df.

    For each row, counts how often each dimension word (word1, word2) elicited
    the row's color word as a SWOW response, and stores the difference
    (word1 count minus word2 count) in df[colname]. Pairs with no SWOW
    responses count as 0.

    Parameters
    ----------
    df : DataFrame with 'word1', 'word2', and 'color' columns.
    swow : DataFrame with 'cue' and 'resp' columns (one row per response).
    colname : name of the new differential column.

    Returns
    -------
    DataFrame : df with the new column appended (merge helper columns dropped).
    """
    # Count responses per (cue, resp) pair. Naming the counts Series
    # explicitly via .rename('n') is robust across pandas versions: in
    # pandas >= 2.0 value_counts() is named 'count', so the previous
    # rename(columns={'resp': 'n'}) silently failed and broke the
    # 'n_x'/'n_y' lookups below.
    counts = swow.groupby('cue')['resp'].value_counts().rename('n').reset_index()
    # left-join counts for word1 (-> *_x after second merge) and word2 (-> *_y)
    df = df.merge(counts, left_on=['word1', 'color'], right_on=['cue', 'resp'], how='left')
    df = df.merge(counts, left_on=['word2', 'color'], right_on=['cue', 'resp'], how='left')
    # unmatched pairs have no SWOW responses -> count 0
    df['n_x'] = df['n_x'].fillna(0)
    df['n_y'] = df['n_y'].fillna(0)
    # differential: associations with word1 minus associations with word2
    df[colname] = df['n_x'] - df['n_y']
    return df.drop(columns=['cue_x', 'cue_y', 'resp_x', 'resp_y', 'n_x', 'n_y'])
swow = swow[swow['cue'].isin(dimensions)]
swow_NZ = swow[(swow['country'] == 'New Zealand')] # select only NZ respondents
swow_US = swow[(swow['country'] == 'United States')] # select only US respondents
# count only R1 (maximal discounting)
df_joint = add_swow(df_joint, swow.rename(columns={'R1': 'resp'}), 'swow_R1')
df_joint = add_swow(df_joint, swow_NZ.rename(columns={'R1': 'resp'}), 'swow_R1_NZ') # US
df_joint = add_swow(df_joint, swow_US.rename(columns={'R1': 'resp'}), 'swow_R1_US') # NZ
# count R1, R2, and R3 with equal weight (minimal discounting)
swow_all = swow.melt(
id_vars=['id', 'participantID', 'created_at', 'cue'],
value_vars=['R1', 'R2', 'R3'],
value_name='resp',
)
df_joint = add_swow(df_joint, swow_all, 'swow_all')
# NZ
swow_all_NZ = swow_NZ.melt(
id_vars=['id', 'participantID', 'created_at', 'cue'],
value_vars=['R1', 'R2', 'R3'],
value_name='resp',
)
df_joint = add_swow(df_joint, swow_all_NZ, 'swow_all_NZ')
# US
swow_all_US = swow_US.melt(
id_vars=['id', 'participantID', 'created_at', 'cue'],
value_vars=['R1', 'R2', 'R3'],
value_name='resp',
)
df_joint = add_swow(df_joint, swow_all_US, 'swow_all_US')
display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | word1 | word2 | frequency | concreteness | swow_R1 | swow_R1_NZ | swow_R1_US | swow_all | swow_all_NZ | swow_all_US | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | cold | hot | -0.216432 | -0.46 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | ripe | unripe | 3.485549 | -0.01 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | new | old | 0.119068 | 0.09 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | submissive | aggressive | -2.352148 | -0.82 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | selfless | jealous | -2.955968 | -0.56 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | light | heavy | 1.240142 | 0.84 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | relaxed | tense | -0.229652 | 0.15 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | alive | dead | -0.904786 | -0.93 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | fast | slow | 0.763262 | 0.04 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | high | low | 1.237676 | 0.12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
42980 rows × 26 columns
# check how many participants gave green as a response to various cues (to use as an example in the paper)
counts = swow_all_US.groupby(['cue', 'resp']).count().reset_index()
green = counts[counts['resp'] == 'green']
display(green)
| cue | resp | id | participantID | created_at | variable | |
|---|---|---|---|---|---|---|
| 233 | alive | green | 1 | 1 | 1 | 1 |
| 508 | clean | green | 1 | 1 | 1 | 1 |
| 1108 | exciting | green | 1 | 1 | 1 | 1 |
| 1289 | fresh | green | 1 | 1 | 1 | 1 |
| 1456 | hard | green | 1 | 1 | 1 | 1 |
| 1706 | jealous | green | 20 | 20 | 20 | 20 |
| 1984 | new | green | 1 | 1 | 1 | 1 |
| 3010 | unripe | green | 18 | 18 | 18 | 18 |
# inspect the extremes of the SWOW differential: most negative values first,
# i.e. pairs where the color is associated far more with word2 than word1
display(df_joint.sort_values('swow_all'))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | word1 | word2 | frequency | concreteness | swow_R1 | swow_R1_NZ | swow_R1_US | swow_all | swow_all_NZ | swow_all_US | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25920 | 11452 | sighted | selfless-jealous | sighted_68676 | green | 2 | replication_2 | self | 4.0 | 0.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 5138 | 242 | sighted | selfless-jealous | sighted_68736 | green | 2 | replication_1 | self | NaN | NaN | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 37530 | 23062 | sighted | selfless-jealous | sighted_67653 | green | 6 | replication_2 | other | 10.0 | 4.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 25416 | 10948 | sighted | selfless-jealous | sighted_69192 | green | 7 | replication_2 | self | 9.0 | 1.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 16956 | 2488 | sighted | selfless-jealous | sighted_68719 | green | 5 | replication_2 | self | 3.0 | 0.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 466 | 466 | blind | clean-dirty | blind_8 | white | 2 | original | self | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 14129 | 9233 | sighted | clean-dirty | sighted_68738 | white | 1 | replication_1 | other | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 33057 | 18589 | sighted | light-heavy | sighted_68150 | white | 2 | replication_2 | other | 9.0 | 0.0 | ... | light | heavy | 1.240142 | 0.84 | 1.0 | 0.0 | 1.0 | 8.0 | 0.0 | 5.0 |
| 12790 | 7894 | sighted | clean-dirty | sighted_68946 | white | 1 | replication_1 | other | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 21537 | 7069 | sighted | light-heavy | sighted_67884 | white | 2 | replication_2 | self | 5.0 | 1.0 | ... | light | heavy | 1.240142 | 0.84 | 1.0 | 0.0 | 1.0 | 8.0 | 0.0 | 5.0 |
42980 rows × 26 columns
(It looks like there are very few responses from NZ, but somewhat more from the US and elsewhere.)
Add cosine distances (Mikolov et al., 2013)¶
def get_cosine(x, vecs_dict):
    """Cosine similarity between a dimension's difference vector and a color vector.

    Builds the direction word2 - word1 (normalized via the file-level `norm`
    helper) and dots it with the color's vector. Words missing from the
    embedding map to the 300-d zero vector.
    """
    missing = np.zeros(300)
    w1 = vecs_dict.get(x['word1'], missing)
    w2 = vecs_dict.get(x['word2'], missing)
    color_vec = vecs_dict.get(x['color'], missing)
    return np.dot(norm(w2 - w1), color_vec)
Common Crawl¶
# embeddings trained on Common Crawl (top 2e5 words, unit-normalized);
# one cosine predictor per rating row
vecs = Vectors('../embeddings/cc.en.300.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_cc'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/cc.en.300.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.176 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.056 seconds
Subtitles¶
# embeddings trained on a subtitle corpus (top 2e5 words, unit-normalized)
vecs = Vectors('../embeddings/subs.en.1e6.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_subs'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/subs.en.1e6.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.236 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.043 seconds
COCA¶
# COCA register-specific embeddings: academic, fiction, magazines, spoken, news.
# The load/as_dict/apply steps were identical five times over, so loop over the
# registers instead (same order, same resulting column names).
for register in ('acad', 'fic', 'mag', 'spok', 'news'):
    vecs = Vectors(f'../embeddings/{register}.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
    vecs_dict = vecs.as_dict()
    # lambda closes over vecs_dict, but apply() runs before the next rebinding
    df_joint[f'cosine_{register}'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
display(df_joint.round(2))
[INFO] loading vectors ../embeddings/acad.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.205 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.044 seconds [INFO] loading vectors ../embeddings/fic.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.082 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.041 seconds [INFO] loading vectors ../embeddings/mag.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 5.965 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.040 seconds [INFO] loading vectors ../embeddings/spok.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.355 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.063 seconds [INFO] loading vectors ../embeddings/news.en.vec [INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.311 seconds [INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.039 seconds
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | swow_all | swow_all_NZ | swow_all_US | cosine_cc | cosine_subs | cosine_acad | cosine_fic | cosine_mag | cosine_spok | cosine_news | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.05 | 0.02 | 0.04 | 0.01 | -0.06 | 0.03 | -0.04 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.02 | 0.09 | 0.03 | 0.16 | 0.07 | -0.20 | 0.03 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.12 | 0.04 | 0.10 | 0.07 | 0.04 | 0.08 | 0.03 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | -0.08 | -0.08 | -0.09 | -0.01 | -0.07 | -0.05 | -0.03 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.05 | -0.01 | -0.01 | -0.01 | -0.01 | 0.11 | 0.01 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | -0.16 | -0.04 | -0.06 | -0.08 | -0.16 | -0.18 | -0.08 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.11 | -0.04 | 0.09 | -0.02 | -0.01 | 0.04 | 0.06 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | -1.0 | 0.0 | 0.0 | 0.15 | 0.06 | 0.02 | 0.05 | -0.00 | 0.05 | 0.10 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.00 | -0.02 | -0.01 | -0.00 | -0.05 | 0.04 | -0.03 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | -0.01 | 0.04 | 0.01 | -0.04 | 0.06 | 0.12 | 0.08 |
42980 rows × 33 columns
Filtered COCA¶
COCA embeddings, but from COCA corpora without sentences with 1st order cooccurrences (sentences with a color word and a dimension word).
# reload the saved predictors, then add cosines from the "cut" fiction corpus
# (presumably a downsized fiction corpus, matching the _small column name — TODO confirm)
# NOTE(review): this reload drops any columns added after the last to_csv call — confirm intended
df_joint = pd.read_csv('data/data_plus_predictors.tsv', sep='\t')
vecs = Vectors('../embeddings/fic_cut.en.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_fic_small'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/fic_cut.en.vec [INFO] <function Vectors.__init__ at 0x14f366170> ran in 4.590 seconds [INFO] <function Vectors.as_dict at 0x14f3663b0> ran in 0.043 seconds
# fiction corpus with sentences containing 1st-order color-dimension cooccurrences removed
vecs = Vectors('../embeddings/fic_no_1st_order_cut.en.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_fic_no_1st_order'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/fic_no_1st_order_cut.en.vec [INFO] <function Vectors.__init__ at 0x14f366170> ran in 5.076 seconds [INFO] <function Vectors.as_dict at 0x14f3663b0> ran in 0.039 seconds
No neighbors COCA¶
COCA embeddings, but from training corpora from which the 25 nearest neighbors of each color and dimension word have been removed (in an attempt to disrupt the "scaffolding" that semantic associations with the colors and dimension words are built on).
We use two filtering regimes, a strong and a weak one. In the strong regime we remove every line that contains any neighbor word. In the weak regime we remove any of the following:
- Line that contains a color word and a neighbor of a dimension word.
- Line that contains a dimension word and a neighbor of a color word.
- Line that contains both a color word and a dimension word.
# strong filtering regime: every line containing any neighbor word removed
vecs = Vectors('../embeddings/fic_no_neighbors_strong_no1st.en.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
# .fillna(0): normalizing a zero (all-words-out-of-vocabulary) difference vector yields NaN
df_joint['cosine_fic_no_neighbors_strong'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1).fillna(0)
# weak filtering regime (see the description above)
vecs = Vectors('../embeddings/fic_no_neighbors_weak_cut.en.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_fic_no_neighbors_weak'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1).fillna(0)
[INFO] loading vectors ../embeddings/fic_no_neighbors_strong_no1st.en.vec [INFO] <function Vectors.__init__ at 0x14f366170> ran in 5.011 seconds [INFO] <function Vectors.as_dict at 0x14f3663b0> ran in 0.067 seconds [INFO] loading vectors ../embeddings/fic_no_neighbors_weak_cut.en.vec [INFO] <function Vectors.__init__ at 0x14f366170> ran in 4.467 seconds [INFO] <function Vectors.as_dict at 0x14f3663b0> ran in 0.041 seconds
No names COCA¶
COCA embeddings, but from training corpora from which the labels generated by at least two participants for color-semantic associations (e.g. the label snow for the combination white and cold) have been removed. (These nameability data are explored in more detail in a section at the end of this notebook.)
# fiction corpus with participant-generated mediator labels removed
vecs = Vectors('../embeddings/fic_no_mediators_cut.en.vec', n=2e5, d=300, normalize=True) # not included in git repo
vecs_dict = vecs.as_dict()
# .fillna(0) handles NaNs from out-of-vocabulary word pairs (zero difference vector)
df_joint['cosine_fic_no_mediators'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1).fillna(0)
[INFO] loading vectors ../embeddings/fic_no_mediators_cut.en.vec [INFO] <function Vectors.__init__ at 0x14f366170> ran in 4.563 seconds [INFO] <function Vectors.as_dict at 0x14f3663b0> ran in 0.037 seconds /var/folders/8h/k__12s992nbc7rmfv_w1rptc0000gp/T/ipykernel_62683/653768335.py:57: RuntimeWarning: invalid value encountered in divide return x / np.linalg.norm(x, 2)
# NOTE(review): written without index=False here, unlike the later save with
# index=False — the index becomes an extra unnamed column on reload; confirm intended
df_joint.to_csv('data/data_plus_predictors.tsv', sep='\t')
Correlations between predictors¶
Correlations between predictors in original data (NZ)¶
# Correlations between predictors, restricted to the original (NZ) experiment.
# BUG FIX: df_orig was computed but the correlation was taken over df_joint
# (the full dataset); the heatmap now actually reflects the NZ subset, as the
# section title claims.
df_orig = df_joint[df_joint['experiment'] == 'original']
corrs = np.abs(df_orig[[
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_no_1st_order',
    'cosine_fic_no_neighbors_weak',
    'cosine_fic_no_neighbors_strong',
    'cosine_fic_no_mediators',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]].corr()).round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
Correlations between predictors in 2nd replication data (US)¶
# Correlations between predictors, restricted to the 2nd replication (US sample).
# BUG FIX: df_rep was computed but the correlation was taken over df_joint
# (the full dataset); the heatmap now actually reflects the US subset.
df_rep = df_joint[df_joint['experiment'] == 'replication_2']
corrs = np.abs(df_rep[[
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_no_1st_order',
    'cosine_fic_no_neighbors_weak',
    'cosine_fic_no_neighbors_strong',
    'cosine_fic_no_mediators',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]].corr()).round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
Correlations between predictors in full dataset¶
# Absolute pairwise correlations between all predictors, full dataset.
predictor_cols = [
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_no_1st_order',
    'cosine_fic_no_neighbors_weak',
    'cosine_fic_no_neighbors_strong',
    'cosine_fic_no_mediators',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]
corrs = df_joint[predictor_cols].corr().abs().round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
Standardize predictors and write to file¶
def standardize(Series):
    """Z-score a Series: center on its mean, scale by its sample (ddof=1) std."""
    centered = Series - Series.mean()
    return centered / Series.std()
# Z-score all continuous predictors. The 29 one-by-one standardize()
# assignments were copy-pasted; loop over a column list instead (same
# creation order, so the resulting column order is unchanged).
z_score_cols = [
    'art', 'fiction', 'nonfiction',
    'reading_motivation', 'reading_part_of_self', 'reading_efficacy',
    'reading_recognition', 'reading_other_realms',
    'rating', 'frequency', 'concreteness',
    'swow_all', 'swow_all_NZ', 'swow_all_US',
    'swow_R1', 'swow_R1_NZ', 'swow_R1_US',
    'cosine_cc', 'cosine_subs', 'cosine_acad', 'cosine_fic',
    'cosine_mag', 'cosine_news', 'cosine_spok',
    'cosine_fic_small', 'cosine_fic_no_1st_order',
    'cosine_fic_no_neighbors_weak', 'cosine_fic_no_neighbors_strong',
    'cosine_fic_no_mediators',
]
for col in z_score_cols:
    df_joint[f'{col}_z'] = standardize(df_joint[col])
# dummy-code group (blind vs sighted); get_dummies hoisted instead of calling it once per column
group_dummies = pd.get_dummies(df_joint['group'])
df_joint['blind'] = group_dummies['blind']
df_joint['sighted'] = group_dummies['sighted']
df_joint['group_eff'] = (df_joint['sighted'] - .5) * 2  # effect coding: blind=-1, sighted=+1
df_joint['group_z'] = standardize(df_joint['sighted'])
# dummy-code experiment
experiment_dummies = pd.get_dummies(df_joint['experiment'])
df_joint['original'] = experiment_dummies['original']
df_joint['replication_1'] = experiment_dummies['replication_1']
df_joint['replication_2'] = experiment_dummies['replication_2']
# dummy-code self vs. other ratings
rater_dummies = pd.get_dummies(df_joint['self_vs_other'])
df_joint['other'] = rater_dummies['other']
df_joint['self'] = rater_dummies['self']
df_joint['self_vs_other_eff'] = (df_joint['other'] - .5) * 2  # effect coding: self=-1, other=+1
df_joint['self_vs_other_z'] = standardize(df_joint['other'])
df_joint.to_csv('data/data_plus_predictors.tsv', sep='\t', index=False)
display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | group_z | original | replication_1 | replication_2 | other | self | self_vs_other_eff | self_vs_other_z | cosine_fic_small | cosine_fic_small_z | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.211241 | True | False | False | False | True | -1.0 | -0.891882 | -0.031634 | -0.475593 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | 0.211241 | True | False | False | False | True | -1.0 | -0.891882 | 0.116445 | 1.327408 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.211241 | True | False | False | False | True | -1.0 | -0.891882 | 0.105395 | 1.192865 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.211241 | True | False | False | False | True | -1.0 | -0.891882 | -0.042038 | -0.602275 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.211241 | True | False | False | False | True | -1.0 | -0.891882 | -0.082589 | -1.096013 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 0.211241 | False | False | True | True | False | 1.0 | 1.121199 | -0.092165 | -1.212613 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 0.211241 | False | False | True | True | False | 1.0 | 1.121199 | 0.070999 | 0.774055 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | 0.211241 | False | False | True | True | False | 1.0 | 1.121199 | 0.079640 | 0.879268 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | 0.211241 | False | False | True | True | False | 1.0 | 1.121199 | 0.056388 | 0.596159 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | 0.211241 | False | False | True | True | False | 1.0 | 1.121199 | -0.064931 | -0.881009 |
42980 rows × 78 columns
# reload predictors from disk so the analyses below start from the saved state
df_joint = pd.read_csv('data/data_plus_predictors.tsv', sep='\t')
Nameability of color-dimension associations¶
def get_cosine_1word(x, vecs_dict, d=300):
    """Return the cosine similarity between a row's dimension and color words.

    The vectors in ``vecs_dict`` are L2-normalized (see the `Vectors(...,
    normalize=True)` load), so the dot product equals cosine similarity.
    Words missing from ``vecs_dict`` fall back to a zero vector, which
    yields a similarity of 0.

    :param x: mapping (e.g. a DataFrame row) with 'dimension' and 'color' keys
    :param vecs_dict: dict mapping words to (normalized) embedding vectors
    :param d: embedding dimensionality for the zero-vector fallback
              (default 300, matching the loaded embeddings)
    :returns: cosine similarity as a float
    """
    zero = np.zeros(d)
    return np.dot(vecs_dict.get(x['dimension'], zero), vecs_dict.get(x['color'], zero))
# fiction
# Load 300-dimensional COCA-fiction embeddings for the 200k most frequent
# words. normalize=True L2-normalizes the vectors, so dot products computed
# later equal cosine similarities.
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True) # not included in git repo
# Convert to a {word: vector} dict for fast per-row lookups.
vecs_dict = vecs.as_dict()
# Per-prompt nameability measures derived from participant-generated labels.
df_names = pd.read_csv('data/color_dimension_nameability.csv')
display(df_names.head())
[INFO] loading vectors ../embeddings/fic.en.vec [INFO] <function Vectors.__init__ at 0x14494eca0> ran in 6.137 seconds [INFO] <function Vectors.as_dict at 0x14494eee0> ran in 0.044 seconds
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | happy_brown | happy | brown | 10 | 1.000000 | 0.800000 | 0.800000 | 0.044444 | 0.200000 | cat,puppy | 0.200000 | cat,puppy |
| 1 | unripe_brown | unripe | brown | 10 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... |
| 2 | hard_brown | hard | brown | 10 | 1.000000 | 0.900000 | 0.800000 | 0.044444 | 0.200000 | wood,rock | 0.200000 | wood |
| 3 | angry_blue | angry | blue | 13 | 1.076923 | 0.714286 | 0.714286 | 0.054945 | 0.230769 | shark | 0.230769 | shark |
| 4 | sad_brown | sad | brown | 10 | 1.100000 | 0.909091 | 0.909091 | 0.018182 | 0.200000 | cat | 0.200000 | cat |
# check how many participants provided labels for each color-adjective pair
responses = df_names['number_responses']
print(responses.min())
print(responses.max())
display(df_names.sort_values('modal_agreement'))
7 13
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 99 | liked_blue | liked | blue | 13 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | sonic,sky,bird,pigeon,phone,smurfs,pencil,colo... | 0.076923 | sonic,sky,bird,pigeon,phone,smurfs,pencil,colo... |
| 40 | relaxed_blue | relaxed | blue | 13 | 1.076923 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | smurfette,meditation,bird,water,tranquility,st... | 0.076923 | smurfette,meditation,bird,water,tranquility,st... |
| 30 | submissive_blue | submissive | blue | 13 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | macaw,nun,bird,swallow,butterfly,flower,door,b... | 0.076923 | macaw,nun,bird,swallow,butterfly,flowers,door,... |
| 91 | old_blue | old | blue | 13 | 1.076923 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | bluecheese,necklace,bird,dress,shoe,smurfs,rug... | 0.076923 | bluecheese,necklace,bird,dress,shoes,smurfs,ru... |
| 192 | clean_yellow | clean | yellow | 12 | 1.083333 | 1.000000 | 1.000000 | 0.000000 | 0.083333 | table,detergant,sun,glove,hat,flag,ford,mustan... | 0.083333 | table,detergant,sun,gloves,hat,flag,ford.musta... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 151 | clean_white | clean | white | 9 | 1.111111 | 0.700000 | 0.600000 | 0.222222 | 0.555556 | sheet | 0.333333 | sheets |
| 170 | ripe_yellow | ripe | yellow | 12 | 1.000000 | 0.500000 | 0.500000 | 0.318182 | 0.583333 | banana | 0.583333 | banana |
| 68 | cold_blue | cold | blue | 13 | 1.000000 | 0.461538 | 0.461538 | 0.358974 | 0.615385 | ice | 0.615385 | ice |
| 147 | cold_white | cold | white | 9 | 1.000000 | 0.333333 | 0.333333 | 0.583333 | 0.777778 | snow | 0.777778 | snow |
| 158 | stale_white | stale | white | 9 | 1.000000 | 0.222222 | 0.222222 | 0.777778 | 0.888889 | bread | 0.888889 | bread |
306 rows × 12 columns
Exporting names generated by participants for use in training corpus filtering¶
# Flatten the comma-separated modal-name strings into one list of labels.
names = [label for response in df_names['modal_names'] for label in response.split(',')]
names_all = set(names)  # all unique names
names_count = Counter(names)
# all names that occur 2+ times
names_2plus = [label for label, count in names_count.most_common() if count >= 2]
print(f'Number of labels named by at least 2 participants: {len(names_2plus)}')
with open('data/pair_labels_all.txt', 'w') as namesfile:
    namesfile.write('\n'.join(names_all))
with open('data/pair_labels_2plus.txt', 'w') as namesfile:
    namesfile.write('\n'.join(names_2plus))
# let's ignore words like "me", "my", and "a" though
Number of labels named by at least 2 participants: 242
Correlating COCA-fiction cosine similarities to nameability measures¶
Since we only have nameability for colors and dimension axis poles (i.e. for yellow and dislike but not yellow and dislike-like), we correlate nameability measures with cosine similarity between color and dimension axis pole.
# Sanity check: the two nameability measures (Simpson diversity and modal
# agreement) are strongly correlated with each other.
pearsonr(df_names['simpson_diversity'], df_names['modal_agreement'])
PearsonRResult(statistic=0.8947743710654124, pvalue=1.816739746708339e-108)
# Cosine similarity between each dimension-pole word and each color word.
df_names['cosine_fic'] = df_names.apply(lambda row: get_cosine_1word(row, vecs_dict), axis=1)
display(df_names.head())
# Correlate cosine similarity with each nameability measure.
for measure in ('simpson_diversity', 'modal_agreement'):
    r, p = pearsonr(df_names['cosine_fic'], df_names[measure])
    print(f'pearsonr(cosine_fiction, {measure}): {r:.3f}, p-value: {p:.3f}')
g = sns.lmplot(x='cosine_fic', y='simpson_diversity', data=df_names)
g = sns.lmplot(x='cosine_fic', y='modal_agreement', data=df_names)
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | cosine_fic | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | happy_brown | happy | brown | 10 | 1.000000 | 0.800000 | 0.800000 | 0.044444 | 0.200000 | cat,puppy | 0.200000 | cat,puppy | 0.142680 |
| 1 | unripe_brown | unripe | brown | 10 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.326845 |
| 2 | hard_brown | hard | brown | 10 | 1.000000 | 0.900000 | 0.800000 | 0.044444 | 0.200000 | wood,rock | 0.200000 | wood | 0.193040 |
| 3 | angry_blue | angry | blue | 13 | 1.076923 | 0.714286 | 0.714286 | 0.054945 | 0.230769 | shark | 0.230769 | shark | 0.160328 |
| 4 | sad_brown | sad | brown | 10 | 1.100000 | 0.909091 | 0.909091 | 0.018182 | 0.200000 | cat | 0.200000 | cat | 0.274516 |
pearsonr(cosine_fiction, simpson_diversity): 0.185, p-value: 0.001 pearsonr(cosine_fiction, modal_agreement): 0.203, p-value: 0.000
Correlating group-averaged human ratings to nameability measure differentials.¶
Since we do not have human ratings for the association between colors and dimension axis poles (only for association between colors and dimension axes), we need to collapse our nameability measures for the two poles of each dimension axis. One way to do this is to compute difference scores.
# Restrict to sighted participants. .copy() so the column assignments below do
# not operate on a view of df_joint (avoids SettingWithCopyWarning and makes the
# intent explicit).
df_sighted = df_joint.loc[df_joint['group'] == 'sighted'].copy()
# Attach nameability measures for each dimension-axis pole: one merge per pole
# (word1 / word2) instead of one merge per pole-and-measure (halves the work).
# NOTE(review): merge() returns a fresh RangeIndex, so these Series assignments
# align by index with df_sighted; this preserves the original cell's behavior,
# but verify df_sighted's index is a clean RangeIndex if results look off.
merged_word1 = df_sighted.merge(df_names, how='left', left_on=['word1', 'color'], right_on=['dimension', 'color'])
merged_word2 = df_sighted.merge(df_names, how='left', left_on=['word2', 'color'], right_on=['dimension', 'color'])
df_sighted['diversity_word1'] = merged_word1['simpson_diversity']
df_sighted['diversity_word2'] = merged_word2['simpson_diversity']
df_sighted['agreement_word1'] = merged_word1['modal_agreement']
df_sighted['agreement_word2'] = merged_word2['modal_agreement']
# Difference scores collapse pole-level nameability to one value per axis.
df_sighted['diff_diversity'] = (df_sighted['diversity_word1'] - df_sighted['diversity_word2'])
df_sighted['diff_agreement'] = (df_sighted['agreement_word1'] - df_sighted['agreement_word2'])
# Drop rows without nameability data (or any other NaNs).
df_sighted = df_sighted.dropna()
display(df_sighted.head())
# Group-level means and SDs per color-dimension pair.
df_mean_sighted = df_sighted.groupby(['color', 'dimension', 'word1', 'word2']).mean().reset_index()
df_sd_sighted = df_sighted.groupby(['color', 'dimension', 'word1', 'word2']).std().reset_index()
# Correlate ratings and cosines with the nameability difference scores.
r, p = pearsonr(df_mean_sighted['rating'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(rating, simpson_diversity_difference): {r:.3f}, p-value: {p:.3f}')
r, p = pearsonr(df_mean_sighted['rating'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(rating, modal_agreement_difference): {r:.3f}, p-value: {p:.3f}')
r, p = pearsonr(df_mean_sighted['cosine_fic'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(cosine_fiction, simpson_diversity_difference): {r:.3f}, p-value: {p:.3f}')
r, p = pearsonr(df_mean_sighted['cosine_fic'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(cosine_fiction, modal_agreement_difference): {r:.3f}, p-value: {p:.3f}')
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | other | self | self_vs_other_eff | self_vs_other_z | diversity_word1 | diversity_word2 | agreement_word1 | agreement_word2 | diff_diversity | diff_agreement | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14468 | 0 | sighted | cold-hot | sighted_69212 | brown | 4 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.012821 | 0.142857 | 0.285714 | -0.012821 | -0.142857 |
| 14469 | 1 | sighted | ripe-unripe | sighted_69212 | brown | 7 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.238095 | 0.035714 | 0.428571 | 0.285714 | 0.202381 | 0.142857 |
| 14470 | 2 | sighted | new-old | sighted_69212 | brown | 6 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.000000 | 0.142857 | 0.142857 | 0.000000 | 0.000000 |
| 14471 | 3 | sighted | submissive-aggressive | sighted_69212 | brown | 2 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.000000 | 0.142857 | 0.142857 | 0.000000 | 0.000000 |
| 14472 | 4 | sighted | selfless-jealous | sighted_69212 | brown | 5 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.044444 | 0.142857 | 0.285714 | -0.044444 | -0.142857 |
5 rows × 82 columns
pearsonr(rating, simpson_diversity_difference): 0.036, p-value: 0.666 pearsonr(rating, modal_agreement_difference): -0.012, p-value: 0.890 pearsonr(cosine_fiction, simpson_diversity_difference): 0.091, p-value: 0.278 pearsonr(cosine_fiction, modal_agreement_difference): 0.032, p-value: 0.707
# Does rating variability relate to nameability difference scores?
df_mean_sighted['rating_sd'] = df_sd_sighted['rating']
g = sns.lmplot(x='rating_sd', y='diff_diversity', data=df_mean_sighted)
x = pearsonr(df_mean_sighted['rating_sd'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(rating_sd, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
# BUG FIX: this cell previously correlated diff_diversity twice (the printed
# output showed two identical lines); the second correlation should test the
# modal-agreement differential, mirroring the split-inverse cell below.
x = pearsonr(df_mean_sighted['rating_sd'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(rating_sd, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
pearsonr(rating_sd, simpson_diversity): 0.280, p-value: 0.001 pearsonr(rating_sd, simpson_diversity): 0.280, p-value: 0.001
Correlation of group-averaged split-inverse ratings with nameability measures¶
One other way to work around the issue of having only color to dimension axis pole nameability is to split and invert the human ratings of color-dimension axis associations to create two scores per rating: One for the right end of the axis (equal to the rating), and one for the left end of the axis (equal to eight minus the rating). For example: If yellow is assigned a 6 on the scale dislike-like, the rating for yellow/like is 6, but we also create a rating of 2 for yellow/dislike.
# Build split-inverse ratings: one row per pole of each dimension axis.
# Left pole (word1): rating is inverted (8 - rating) so that high values mean
# strong association with that pole. Right pole (word2): rating kept as-is.
shared_cols = ['color', 'rating']
left = df_sighted[shared_cols + ['word1', 'diversity_word1', 'agreement_word1']].rename(columns={
    'word1': 'dimension',
    'diversity_word1': 'simpson_diversity',
    'agreement_word1': 'modal_agreement'
})
left['rating'] = 8 - left['rating']
right = df_sighted[shared_cols + ['word2', 'diversity_word2', 'agreement_word2']].rename(columns={
    'word2': 'dimension',
    'diversity_word2': 'simpson_diversity',
    'agreement_word2': 'modal_agreement'
})
df_inverse = pd.concat([left, right])
display(df_inverse)
# Group-level means and SDs per color and dimension pole.
df_mean_inverse = df_inverse.groupby(['color', 'dimension']).mean().reset_index()
df_sd_inverse = df_inverse.groupby(['color', 'dimension']).std().reset_index()
# Correlate split-inverse ratings with each nameability measure.
for measure in ('simpson_diversity', 'modal_agreement'):
    r, p = pearsonr(df_mean_inverse['rating'], df_mean_inverse[measure])
    print(f'pearsonr(rating, {measure}): {r:.3f}, p-value: {p:.3f}')
| color | dimension | rating | simpson_diversity | modal_agreement | |
|---|---|---|---|---|---|
| 14468 | brown | cold | 4 | 0.000000 | 0.142857 |
| 14469 | brown | ripe | 1 | 0.238095 | 0.428571 |
| 14470 | brown | new | 2 | 0.000000 | 0.142857 |
| 14471 | brown | submissive | 6 | 0.000000 | 0.142857 |
| 14472 | brown | selfless | 3 | 0.000000 | 0.142857 |
| ... | ... | ... | ... | ... | ... |
| 41138 | yellow | hard | 2 | 0.000000 | 0.125000 |
| 41139 | yellow | heavy | 2 | 0.000000 | 0.125000 |
| 41140 | yellow | tense | 2 | 0.000000 | 0.125000 |
| 41141 | yellow | dead | 2 | 0.000000 | 0.125000 |
| 41142 | yellow | slow | 2 | 0.000000 | 0.125000 |
46272 rows × 5 columns
pearsonr(rating, simpson_diversity): 0.062, p-value: 0.293 pearsonr(rating, modal_agreement): 0.070, p-value: 0.237
# Does rating variability relate to nameability of the split-inverse poles?
df_mean_inverse['rating_sd'] = df_sd_inverse['rating']
g = sns.lmplot(x='rating_sd', y='modal_agreement', data=df_mean_inverse)
g = sns.lmplot(x='rating_sd', y='simpson_diversity', data=df_mean_inverse)
for measure in ('simpson_diversity', 'modal_agreement'):
    r, p = pearsonr(df_mean_inverse['rating_sd'], df_mean_inverse[measure])
    print(f'pearsonr(rating_sd, {measure}): {r:.3f}, p-value: {p:.3f}')
pearsonr(rating_sd, simpson_diversity): 0.228, p-value: 0.000 pearsonr(rating_sd, modal_agreement): 0.228, p-value: 0.000
In short: nameability (measured as simpson diversity and name agreement for the modal name) is weakly correlated with cosine similarity between colors and dimension axis poles, but not with human ratings, regardless of whether we fit the nameability to the ratings (by computing difference scores for the nameability measures) or fit the ratings to the nameability (by computing inverse ratings for the left poles of the dimension axes).
More figures¶
Mean color ratings on each dimension¶
# Drop the high-low dimension from the visualizations.
df_viz = df_joint[df_joint['dimension'] != 'high-low']
df_means = df_viz.groupby(['dimension', 'color', 'word1', 'word2']).mean().reset_index()
# Order dimensions by the spread (SD) of their mean ratings, most variable first.
dim_order = df_means.groupby('dimension').std().sort_values('rating', ascending=False).reset_index()['dimension']
df_means = df_means.set_index('dimension').loc[dim_order].reset_index()
# Per dimension, select the color(s) with the lowest and highest mean rating.
mins = df_means[df_means['rating'] == df_means.groupby(['dimension'])['rating'].transform(min)]
maxs = df_means[df_means['rating'] == df_means.groupby(['dimension'])['rating'].transform(max)]
# Pull the raw (per-participant) ratings for those extreme color-dimension pairs.
df_mins = mins[['word2', 'dimension', 'color']].merge(df_viz[['word2', 'dimension', 'color', 'rating']], how='left', on=['dimension', 'color', 'word2'])
df_maxs = maxs[['word1', 'dimension', 'color']].merge(df_viz[['word1', 'dimension', 'color', 'rating']], how='left', on=['dimension', 'color', 'word1'])
display(df_mins)
display(df_maxs)
sns.set_style('darkgrid')
# Map each color name to itself so seaborn draws points in the named color.
all_colors = {color: color for color in df_viz['color']}
fig, ax1 = plt.subplots(figsize=(3, 8))
# BUG FIX: seaborn's errorbar=('ci', level) takes the confidence level in
# percent (0-100); the previous ('ci', .95) requested a 0.95% interval,
# i.e. an essentially invisible error bar. Use 95 for a 95% CI.
sns.pointplot(data=df_viz, y='word1', x='rating', hue='color',
              palette=all_colors, join=False, dodge=False, ax=ax1, errorbar=('ci', 95))
# Twin axis so word2 pole labels appear on the right-hand side.
ax2 = ax1.twinx()
sns.pointplot(data=df_viz, y='word2', x='rating', hue='color',
              palette=all_colors, join=False, dodge=False, ax=ax2, errorbar=('ci', 95))
ax1.set(ylabel='')
ax2.set(ylabel='')
ax1.get_legend().remove()
ax2.get_legend().remove()
ax1.set(xlim=[1, 7], xticks=[1, 2, 3, 4, 5, 6, 7]);
sns.set_style('whitegrid')
# Map each color name to itself so each violin is drawn in its own color.
mins_colors = {color: color for color in mins['color']}
maxs_colors = {color: color for color in maxs['color']}
fig, ax1 = plt.subplots(figsize=(3, 7))
# Rating distributions for the top-rated color per dimension (word1 poles).
sns.violinplot(data=df_maxs, y='word1', x='rating', hue='color', #scale='width',
palette=maxs_colors, dodge=False, ax=ax1, inner=None, cut=0)
# Twin axis: bottom-rated color distributions labelled on the right (word2 poles).
ax2 = ax1.twinx()
sns.violinplot(data=df_mins, y='word2', x='rating', hue='color', #scale='area',
palette=mins_colors, dodge=False, ax=ax2, inner=None, cut=0)
# Slight transparency so overlapping violins stay readable.
plt.setp(ax1.collections, alpha=.8)
plt.setp(ax2.collections, alpha=.8)
ax1.set(ylabel='')
ax2.set(ylabel='')
ax1.get_legend().remove()
ax2.get_legend().remove()
ax1.set(xlim=[1, 7], xticks=[1, 2, 3, 4, 5, 6, 7])
plt.savefig('figures/color_ratings.pdf', bbox_inches='tight')
Scatterplot with connected points¶
sns.set_style('darkgrid')
# Group means per color-dimension pair, separately for blind and sighted.
# (Note: this rebinds df_sighted/df_blind to the aggregated frames used below.)
df_blind = df_viz[df_viz['group'] == 'blind'].groupby(['group', 'dimension', 'color', 'word1', 'word2']).mean().reset_index()
df_sighted = df_viz[df_viz['group'] == 'sighted'].groupby(['group', 'dimension', 'color', 'word1', 'word2']).mean().reset_index()
df_scatter = pd.concat([df_blind, df_sighted])
df_scatter['colordim'] = df_scatter['color'] + '_' + df_scatter['dimension']
df_scatter = df_scatter.sort_values('cosine_fic_z')
means_colors = {row['color']: row['color'] for _, row in df_scatter.iterrows()}
# One panel per group, points colored by the color word itself.
g = sns.FacetGrid(df_scatter, hue='color', col='group', height=5, palette=means_colors, aspect=.5, sharex=True)
g.map(plt.scatter, 'cosine_fic_z', 'rating', s=10)
# BUG FIX: regplot's documented way to suppress the confidence band is
# ci=None; ci=False is not a valid level and made seaborn bootstrap a
# degenerate zero-width interval.
g.map(sns.regplot, 'cosine_fic_z', 'rating', scatter=False, ci=None)#, linewidth=.5)
g.set(xlabel='COCA-fiction\nembedding projection')
g.axes[0][0].set(ylabel='mean participant rating')
g.axes[0][0].set(title='blind')
g.axes[0][1].set(title='sighted')
g.set(ylim=[.75, 7.25], xlim=[-2.9, 2.9])
plt.savefig('figures/scatter_color.pdf', bbox_inches='tight')
# Collapse to one mean rating per dimension-color pair for each group, then
# place sighted and blind ratings side by side for a direct comparison.
df_sighted_mean = df_sighted.groupby(['dimension', 'color']).mean().reset_index()
df_blind_mean = df_blind.groupby(['dimension', 'color']).mean().reset_index()
df_ratings = df_sighted_mean[['dimension', 'color', 'rating']].merge(
    df_blind_mean[['dimension', 'color', 'rating']], on=['dimension', 'color'], how='left'
).rename(columns={'rating_x': 'rating_sighted', 'rating_y': 'rating_blind'})
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(x='rating_sighted', y='rating_blind', hue='color',
                palette=all_colors, legend=False, ax=ax, data=df_ratings)
ax.set(ylabel='mean blind association rating', xlabel='mean sighted association rating',
       ylim=[1, 7], xlim=[1, 7]);
def annotate(df, color, dimension, x=0, y=0):
    """Label the scatter point for (color, dimension) on the current axes.

    Looks up the first row of `df` matching `color` and `dimension` and draws
    the text '<color> on <dimension>' at that row's
    (rating_sighted + x, rating_blind + y) position.

    :param df: DataFrame with 'color', 'dimension', 'rating_sighted',
               'rating_blind' columns (e.g. df_ratings)
    :param color: color word to match
    :param dimension: dimension-axis label to match
    :param x: horizontal offset for the label (manual placement tweak)
    :param y: vertical offset for the label
    """
    # Select the matching row once instead of repeating the boolean lookup.
    row = df.loc[(df['color'] == color) & (df['dimension'] == dimension)].iloc[0]
    plt.text(row['rating_sighted'] + x, row['rating_blind'] + y,
             f'{color} on {dimension}', fontdict={'size': 'small'})
# Hand-placed labels for a few noteworthy points (offsets tuned manually).
for label_color, label_dim, dx, dy in [
    ('white', 'clean-dirty', -1.95, -.05),
    ('blue', 'cold-hot', .05, -.20),
    ('red', 'cold-hot', .1, -.05),
    ('orange', 'cold-hot', .05, +.05),
    ('black', 'cold-hot', -1.75, -.05),
    ('red', 'relaxed-tense', .1, -.05),
]:
    annotate(df_ratings, label_color, label_dim, dx, dy)
#df_ratings.apply(lambda row: annotate(df_ratings, row['color'], row['dimension']), axis=1)
plt.savefig('figures/blind_vs_sighted_scatter.pdf', bbox_inches='tight')
# Scatter of mean rating vs. mean embedding projection, one panel per dimension.
df_viz = df_joint[df_joint['dimension'] != 'high-low']
#df_viz = df_viz[df_viz.group == 'sighted']
all_colors = {color: color for color in df_viz['color']}
sns.set(style='darkgrid')
# 3x6 grid of panels; the unused bottom-right panel is hidden.
fig, axes = plt.subplots(3, 6, sharex=True, sharey=True)
axes[2, 5].set_axis_off()
# Mean rating and mean z-scored cosine per dimension-color pair.
df_panels = df_viz[['dimension', 'color', 'rating', 'cosine_fic_z']].groupby(['dimension', 'color']).mean().reset_index()
for i, dimension in enumerate(df_panels.dimension.unique()):
    # Panels fill the grid row-major: row i // 6, column i % 6.
    sns.scatterplot(
        x='rating',
        y='cosine_fic_z',
        hue='color',
        palette=all_colors,
        data=df_panels[df_panels.dimension == dimension],
        ax=axes[i // 6, i % 6],
        legend=False
    )
    # Per-panel regression lines, currently disabled (kept as a no-op string).
    """
    sns.regplot(
        x='rating',
        y='cosine_fic_z',
        scatter=False,
        color='gray',
        data=df_viz[df_viz.dimension == dimension],
        ax=axes[i // 6, i % 6]
    )
    """
    axes[i // 6, i % 6].set(
        title=dimension,
        xticks=range(2, 7),
        yticks=range(-2, 3),
        xlabel=None,
        ylabel=None
    )
# Shared axis labels placed on the middle-left and bottom-center panels.
axes[1, 0].set(ylabel='COCA-fiction embedding projection')
axes[2, 2].set(xlabel='mean participant rating')
plt.savefig('figures/rating_vs_cosine_scatter.pdf', bbox_inches='tight')
Convert notebook to html¶
# Render this notebook to HTML via nbconvert. Requires the convert_notebook
# helper (defined in the first cell) to be in scope — running this cell in a
# fresh kernel without executing the setup cell raises NameError.
convert_notebook('data_prep')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[2], line 1 ----> 1 convert_notebook('data_prep') NameError: name 'convert_notebook' is not defined